{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### script to tag articles using the resolved enities collection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pymongo import MongoClient"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bson.objectid import ObjectId"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "client = MongoClient(host='act4dgem.cse.iitd.ac.in')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "mediaDb = client.get_database('media-db')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "articles = mediaDb.get_collection('2L_articles')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "resolved_entities = mediaDb.get_collection('2L_articles_resolved_entities')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "taggedArticles = dict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "def updateText(objectId, text, entype, offset, length):\n",
    "    if entype == 'Person':\n",
    "        otag = '<PER>'\n",
    "        ctag = '</PER>'\n",
    "    elif entype == 'Company' or entype == 'Organization':\n",
    "        otag = '<ORG>'\n",
    "        ctag = '</ORG>'\n",
    "    #elif entype == 'Country' or entype == 'City' or entype =='ProvinceOrState' or entype =='Continent':\n",
    "    elif entype == 'City':\n",
    "        otag = '<LOC>'\n",
    "        ctag = '</LOC>'\n",
    "\n",
    "    elif entype == 'ProvinceOrState':\n",
    "        otag = '<STA>'\n",
    "        ctag = '</STA>'\n",
    "        \n",
    "    else :\n",
    "        otag = '<MISC>'\n",
    "        ctag = '</MISC>'\n",
    "    p1 = text[:offset]\n",
    "    p2 = otag + str(objectId) + ctag\n",
    "    p3 = text[offset+length:]\n",
    "\n",
    "    shift = len(p2) - length\n",
    "\n",
    "    return (p1 + p2 + p3, shift)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def updateOffset(entities, old_offset, shift):\n",
    "    #print 'Updated Offset'\n",
    "    for e in entities:\n",
    "        for inst in e['instances']:\n",
    "            #print inst\n",
    "            if inst['offset'] > old_offset:\n",
    "                inst['offset']+=shift\n",
    "    return entities\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "def updateEntityData(entity):\n",
    "    global articles,taggedArticles\n",
    "    articleIds = entity['articleIds']\n",
    "    names = entity['aliases']\n",
    "    \n",
    "    for articleId in articleIds:\n",
    "        if articleId in taggedArticles:\n",
    "            text = taggedArticles[articleId]['text']\n",
    "            try:\n",
    "                entities = taggedArticles[articleId]['entities']\n",
    "            except:\n",
    "                continue\n",
    "        else:\n",
    "            a = articles.find({'_id':ObjectId(articleId)}, no_cursor_timeout=True)\n",
    "            if a.count()==0:\n",
    "                print(\"Article Not Found\")\n",
    "                print(articleId)\n",
    "                continue\n",
    "            a = a[0]\n",
    "            text = a['text']\n",
    "            try:\n",
    "                entities = a['entities']\n",
    "            except:\n",
    "                continue\n",
    "        for e in entities:\n",
    "            if e['name'] in names:\n",
    "                for inst in e['instances']:\n",
    "                    old_offset = inst['offset']\n",
    "                    (text, shift) = updateText(entity['_id'], text, e['type'], inst['offset'], inst['length'])\n",
    "                    entities = updateOffset(entities, old_offset, shift)\n",
    "        if articleId not in taggedArticles:\n",
    "            taggedArticles[articleId] = a    \n",
    "        taggedArticles[articleId]['text'] = text\n",
    "        taggedArticles[articleId]['entities'] = entities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/manikaran/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:15: DeprecationWarning: count is deprecated. Use Collection.count_documents instead.\n",
      "  from ipykernel import kernelapp as app\n"
     ]
    }
   ],
   "source": [
    "for e in entities.find():\n",
    "    updateEntityData(e)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('../files/resolvedArticles.txt', 'w') as f:\n",
    "    for a in taggedArticles:\n",
    "        print(taggedArticles[a]['text'], file=f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
